APPLIED MACHINE LEARNING ASSIGNMENT 01¶

Dataset: Employee Data Analysis¶

In [206]:
# Data Analysis Phase
## MAin aim is to understand more about the data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats
import seaborn as sns
## Display all the columns of the dataframe

pd.pandas.set_option('display.max_columns',None)
In [207]:
first_data=pd.read_csv(r"C:\Users\Bilal\Desktop\AML Assignment\world_bank_development_indicators.csv")
df=first_data.copy()
## print shape of dataset with  rows and columns
print(df.shape)
(16780, 50)
In [208]:
# Handling missing values

# numeric_columns = df.select_dtypes(include=['number']).columns

# numeric_columns = [col for col in numeric_columns if col != 'country' and col !='date']

# for column in numeric_columns:
#     median_value = df[column].median()
#     df[column].fillna(median_value, inplace=True)
In [209]:
df
Out[209]:
country date agricultural_land% forest_land% land_area avg_precipitation trade_in_services% control_of_corruption_estimate control_of_corruption_std access_to_electricity% renewvable_energy_consumption% electric_power_consumption CO2_emisions other_greenhouse_emisions population_density inflation_annual% real_interest_rate risk_premium_on_lending research_and_development_expenditure% central_goverment_debt% tax_revenue% expense% goverment_effectiveness_estimate goverment_effectiveness_std human_capital_index doing_business time_to_get_operation_license statistical_performance_indicators individuals_using_internet% logistic_performance_index military_expenditure% GDP_current_US political_stability_estimate political_stability_std rule_of_law_estimate rule_of_law_std regulatory_quality_estimate regulatory_quality_std government_expenditure_on_education% government_health_expenditure% multidimensional_poverty_headcount_ratio% gini_index birth_rate death_rate life_expectancy_at_birth population rural_population voice_and_accountability_estimate voice_and_accountability_std intentional_homicides
0 Afghanistan 1/1/1960 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 537777811.10 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 50.34 31.92 32.53 8622466.00 7898093.00 NaN NaN NaN
1 Afghanistan 1/1/1961 57.80 NaN 652230.00 327.00 NaN NaN NaN NaN NaN NaN NaN NaN 13.48 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 548888895.60 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 50.44 31.35 33.07 8790140.00 8026804.00 NaN NaN NaN
2 Afghanistan 1/1/1962 57.89 NaN 652230.00 327.00 NaN NaN NaN NaN NaN NaN NaN NaN 13.75 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 546666677.80 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 50.57 30.84 33.55 8969047.00 8163985.00 NaN NaN NaN
3 Afghanistan 1/1/1963 57.97 NaN 652230.00 327.00 NaN NaN NaN NaN NaN NaN NaN NaN 14.04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 751111191.10 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 50.70 30.36 34.02 9157465.00 8308019.00 NaN NaN NaN
4 Afghanistan 1/1/1964 58.07 NaN 652230.00 327.00 NaN NaN NaN NaN NaN NaN NaN NaN 14.34 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 800000044.40 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 50.83 29.87 34.49 9355514.00 8458694.00 NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
16775 Zimbabwe 1/1/2018 41.88 45.33 386850.00 657.00 4.47 -1.23 0.13 45.40 80.43 NaN 11069.90 29374.29 38.91 10.62 -64.38 NaN NaN NaN 7.21 10.98 -1.26 0.17 0.46 NaN NaN 59.51 25.00 2.12 0.31 34156069918.00 -0.72 0.22 -1.28 0.14 -1.51 0.16 3.87 1.59 NaN NaN 32.07 7.97 61.41 15052184.00 10204026.00 -1.14 0.12 4.88
16776 Zimbabwe 1/1/2019 41.88 45.21 386850.00 657.00 6.93 -1.27 0.14 46.68 81.52 NaN 10185.30 28697.16 39.69 255.30 -64.30 NaN NaN NaN NaN NaN -1.27 0.18 NaN 140.00 NaN 61.65 26.59 NaN 0.53 21832234926.00 -0.96 0.23 -1.29 0.14 -1.47 0.16 NaN 0.52 NaN 50.30 31.52 8.04 61.29 15354608.00 10408889.00 -1.16 0.12 5.15
16777 Zimbabwe 1/1/2020 41.88 45.09 386850.00 657.00 5.12 -1.29 0.14 52.75 84.36 NaN 8312.50 25988.13 40.51 557.20 -81.13 NaN NaN NaN NaN NaN -1.30 0.21 0.47 NaN NaN NaN 29.30 NaN 0.01 21509698406.00 -1.07 0.24 -1.31 0.14 -1.42 0.17 NaN 0.76 NaN NaN 31.01 8.13 61.12 15669666.00 10617452.00 -1.11 0.12 4.98
16778 Zimbabwe 1/1/2021 NaN NaN NaN NaN NaN -1.26 0.15 48.98 NaN NaN NaN NaN NaN 98.55 -31.80 NaN NaN NaN NaN NaN -1.24 0.21 NaN NaN NaN NaN 34.81 NaN 0.01 28371238666.00 -1.03 0.24 -1.26 0.15 -1.37 0.16 NaN NaN NaN NaN 30.54 9.06 59.25 15993524.00 10827136.00 -1.14 0.12 6.14
16779 Zimbabwe 1/1/2022 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 104.71 -18.32 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.50 NaN 20678055598.00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16320537.00 11033499.00 NaN NaN NaN

16780 rows × 50 columns

The dataset used for this assignment is the World Bank Development Indicators dataset which can be accessed from: https://www.kaggle.com/datasets/nicolasgonzalezmunoz/world-bank-world-development-indicators¶

Task 1.¶

Data Summarization:¶

Calculate basic summary statistics (mean, median, standard deviation, etc.) for each numerical variable.¶

Count the frequency of unique values for categorical variables.¶

Calculate the number of missing values for each variable.¶

Calculating missing values in the cells¶

In [210]:
dataset=df.copy()
features_with_na=[features for features in dataset.columns if dataset[features].isnull().sum()>1]

for feature in features_with_na:
    print(feature, np.round(dataset[feature].isnull().mean(), 2),  '\b% missing values')

print()
print()
print()

    
features_with_nan = [feature for feature in dataset.columns if dataset[feature].isna().sum() > 1]

for feature in features_with_nan:
    print(f"{feature}: {np.round(dataset[feature].isna().mean() * 100, 2)}% NaN values")
agricultural_land% 0.09 % missing values
forest_land% 0.53 % missing values
land_area 0.07 % missing values
avg_precipitation 0.4 % missing values
trade_in_services% 0.45 % missing values
control_of_corruption_estimate 0.73 % missing values
control_of_corruption_std 0.73 % missing values
access_to_electricity% 0.56 % missing values
renewvable_energy_consumption% 0.52 % missing values
electric_power_consumption 0.54 % missing values
CO2_emisions 0.56 % missing values
other_greenhouse_emisions 0.56 % missing values
population_density 0.07 % missing values
inflation_annual% 0.36 % missing values
real_interest_rate 0.74 % missing values
risk_premium_on_lending 0.86 % missing values
research_and_development_expenditure% 0.83 % missing values
central_goverment_debt% 0.88 % missing values
tax_revenue% 0.7 % missing values
expense% 0.72 % missing values
goverment_effectiveness_estimate 0.73 % missing values
goverment_effectiveness_std 0.73 % missing values
human_capital_index 0.96 % missing values
doing_business 0.99 % missing values
time_to_get_operation_license 0.98 % missing values
statistical_performance_indicators 0.96 % missing values
individuals_using_internet% 0.52 % missing values
logistic_performance_index 0.92 % missing values
military_expenditure% 0.41 % missing values
GDP_current_US 0.2 % missing values
political_stability_estimate 0.72 % missing values
political_stability_std 0.72 % missing values
rule_of_law_estimate 0.72 % missing values
rule_of_law_std 0.72 % missing values
regulatory_quality_estimate 0.73 % missing values
regulatory_quality_std 0.73 % missing values
government_expenditure_on_education% 0.65 % missing values
government_health_expenditure% 0.71 % missing values
multidimensional_poverty_headcount_ratio% 0.97 % missing values
gini_index 0.88 % missing values
birth_rate 0.04 % missing values
death_rate 0.05 % missing values
life_expectancy_at_birth 0.05 % missing values
population 0.01 % missing values
rural_population 0.01 % missing values
voice_and_accountability_estimate 0.72 % missing values
voice_and_accountability_std 0.72 % missing values
intentional_homicides 0.75 % missing values



agricultural_land%: 8.83% NaN values
forest_land%: 52.84% NaN values
land_area: 6.98% NaN values
avg_precipitation: 39.89% NaN values
trade_in_services%: 45.38% NaN values
control_of_corruption_estimate: 72.8% NaN values
control_of_corruption_std: 72.8% NaN values
access_to_electricity%: 56.21% NaN values
renewvable_energy_consumption%: 51.87% NaN values
electric_power_consumption: 53.58% NaN values
CO2_emisions: 55.85% NaN values
other_greenhouse_emisions: 55.85% NaN values
population_density: 7.16% NaN values
inflation_annual%: 35.79% NaN values
real_interest_rate: 73.8% NaN values
risk_premium_on_lending: 85.89% NaN values
research_and_development_expenditure%: 83.46% NaN values
central_goverment_debt%: 87.67% NaN values
tax_revenue%: 69.73% NaN values
expense%: 71.79% NaN values
goverment_effectiveness_estimate: 72.94% NaN values
goverment_effectiveness_std: 72.94% NaN values
human_capital_index: 96.42% NaN values
doing_business: 98.87% NaN values
time_to_get_operation_license: 97.84% NaN values
statistical_performance_indicators: 95.89% NaN values
individuals_using_internet%: 52.47% NaN values
logistic_performance_index: 91.62% NaN values
military_expenditure%: 40.9% NaN values
GDP_current_US: 20.35% NaN values
political_stability_estimate: 72.5% NaN values
political_stability_std: 72.5% NaN values
rule_of_law_estimate: 72.26% NaN values
rule_of_law_std: 72.26% NaN values
regulatory_quality_estimate: 72.93% NaN values
regulatory_quality_std: 72.93% NaN values
government_expenditure_on_education%: 64.54% NaN values
government_health_expenditure%: 70.57% NaN values
multidimensional_poverty_headcount_ratio%: 97.06% NaN values
gini_index: 88.46% NaN values
birth_rate: 4.43% NaN values
death_rate: 4.54% NaN values
life_expectancy_at_birth: 5.45% NaN values
population: 0.69% NaN values
rural_population: 1.44% NaN values
voice_and_accountability_estimate: 72.3% NaN values
voice_and_accountability_std: 72.3% NaN values
intentional_homicides: 74.92% NaN values
In [211]:
CountryWiseData=dataset.groupby('country').mean()
CountryWiseData
Out[211]:
agricultural_land% forest_land% land_area avg_precipitation trade_in_services% control_of_corruption_estimate control_of_corruption_std access_to_electricity% renewvable_energy_consumption% electric_power_consumption CO2_emisions other_greenhouse_emisions population_density inflation_annual% real_interest_rate risk_premium_on_lending research_and_development_expenditure% central_goverment_debt% tax_revenue% expense% goverment_effectiveness_estimate goverment_effectiveness_std human_capital_index doing_business time_to_get_operation_license statistical_performance_indicators individuals_using_internet% logistic_performance_index military_expenditure% GDP_current_US political_stability_estimate political_stability_std rule_of_law_estimate rule_of_law_std regulatory_quality_estimate regulatory_quality_std government_expenditure_on_education% government_health_expenditure% multidimensional_poverty_headcount_ratio% gini_index birth_rate death_rate life_expectancy_at_birth population rural_population voice_and_accountability_estimate voice_and_accountability_std intentional_homicides
country
Afghanistan 58.18 1.85 652230.00 327.00 12.65 -1.43 0.22 55.82 27.04 NaN 4821.06 20799.12 27.34 6.12 10.00 NaN NaN NaN 7.80 41.62 -1.50 0.26 0.39 173.00 13.75 44.85 4.76 1.97 1.65 7717726754.86 -2.48 0.30 -1.72 0.21 -1.50 0.23 2.69 0.54 50.55 NaN 47.67 18.23 47.92 18410104.44 14513617.44 -1.30 0.16 6.02
Africa Eastern and Southern 43.69 33.45 14632485.85 NaN 9.83 NaN NaN 30.99 63.33 716.08 452640.56 1116224.04 23.42 10.38 NaN NaN 0.62 NaN 17.78 24.07 NaN NaN NaN NaN 13.24 NaN 6.43 2.48 2.52 366730373780.84 NaN NaN NaN NaN NaN NaN 4.39 2.55 NaN NaN 42.79 14.90 51.89 351919799.95 251054684.00 NaN NaN 11.97
Africa Western and Central 35.83 21.19 9045959.88 NaN 9.66 NaN NaN 41.27 81.26 127.28 153913.71 643892.33 25.87 4.64 NaN NaN 0.15 NaN NaN NaN NaN NaN NaN NaN 32.29 NaN 9.42 2.41 1.79 258364273935.03 NaN NaN NaN NaN NaN NaN 2.82 0.80 NaN NaN 44.23 17.59 48.33 239756352.51 153870196.38 NaN NaN 9.83
Albania 42.15 28.50 27400.00 1485.00 23.31 -0.68 0.17 99.80 40.24 1167.18 3881.95 8614.21 99.59 15.06 2.44 6.20 0.12 66.81 16.43 24.20 -0.31 0.23 0.61 82.00 14.77 71.14 25.38 2.48 2.79 7480345027.69 -0.09 0.27 -0.55 0.17 0.02 0.21 3.37 2.62 47.60 31.01 22.85 7.73 71.50 2713063.16 1574154.24 0.03 0.15 6.40
Algeria 17.45 0.74 2381740.10 89.00 6.83 -0.64 0.18 99.06 0.32 599.61 105761.24 193574.39 10.72 8.60 1.90 5.82 0.26 NaN NaN NaN -0.56 0.21 0.53 157.00 19.30 48.79 16.35 2.46 3.15 71794434279.38 -1.23 0.26 -0.84 0.17 -0.96 0.23 6.45 3.52 NaN 34.37 33.52 10.73 61.42 25915130.46 10791909.30 -0.98 0.14 1.17
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
West Bank and Gaza 79.33 1.59 6020.00 402.00 16.51 -0.26 0.28 99.54 17.25 NaN NaN NaN 558.20 2.96 2.84 NaN 0.30 NaN 8.36 12.43 -0.82 0.33 0.57 117.00 16.77 67.16 27.35 NaN NaN 9172903448.28 -1.77 0.32 -0.37 0.27 -0.38 0.35 5.11 NaN 24.00 34.52 37.09 4.29 71.79 3458736.48 907983.03 -0.87 0.19 1.35
World 36.36 31.78 129717210.21 NaN 9.53 NaN NaN 83.60 17.34 2104.54 28173346.98 38928765.52 41.34 5.36 NaN NaN 2.06 NaN 14.19 26.53 NaN NaN NaN NaN 31.47 NaN 21.50 2.87 3.47 32366182539682.54 NaN NaN NaN NaN NaN NaN 4.15 5.66 NaN NaN 25.52 9.89 64.60 5407739439.19 2920233053.79 NaN NaN 6.17
Yemen, Rep. 44.52 1.04 527970.00 167.00 10.41 -1.22 0.18 56.89 1.37 114.12 15957.08 33225.43 29.21 17.50 7.13 4.18 NaN NaN NaN NaN -1.33 0.22 0.37 187.00 6.75 39.09 6.75 2.40 5.63 18230152874.03 -2.11 0.26 -1.37 0.17 -1.00 0.21 6.49 1.56 NaN 35.47 46.35 14.83 54.78 15833245.87 11515379.17 -1.29 0.14 4.59
Zambia 28.79 62.69 743390.00 1020.00 11.79 -0.56 0.16 24.77 86.38 843.76 3320.69 28515.98 11.84 36.44 -0.49 3.77 0.05 105.35 15.50 18.25 -0.80 0.18 0.39 85.00 30.63 57.93 4.26 2.41 1.87 8035327954.94 0.21 0.26 -0.43 0.16 -0.55 0.19 3.79 1.83 NaN 52.69 45.67 13.65 52.34 9055472.62 5573550.30 -0.29 0.13 7.39
Zimbabwe 35.50 46.88 386850.00 657.58 9.63 -1.23 0.16 37.57 75.06 842.78 12378.40 29168.66 24.68 79.61 -20.57 NaN NaN NaN 14.83 18.91 -1.17 0.18 0.44 140.00 7.00 53.54 8.33 2.31 3.25 8333485469.06 -0.95 0.26 -1.50 0.15 -1.70 0.18 9.08 1.75 NaN 45.93 39.59 11.92 54.74 9667335.21 6857547.03 -1.30 0.13 8.34

268 rows × 48 columns

In [212]:
# Selecting columns which have only numerical data
numerical_columns = dataset.select_dtypes(include=[np.number])
non_numerical_columns = dataset.select_dtypes(exclude=[np.number])
In [213]:
print("The Columns which have numerical values are")
numerical_columns.columns.tolist()
The Columns which have numerical values are
Out[213]:
['agricultural_land%',
 'forest_land%',
 'land_area',
 'avg_precipitation',
 'trade_in_services%',
 'control_of_corruption_estimate',
 'control_of_corruption_std',
 'access_to_electricity%',
 'renewvable_energy_consumption%',
 'electric_power_consumption',
 'CO2_emisions',
 'other_greenhouse_emisions',
 'population_density',
 'inflation_annual%',
 'real_interest_rate',
 'risk_premium_on_lending',
 'research_and_development_expenditure%',
 'central_goverment_debt%',
 'tax_revenue%',
 'expense%',
 'goverment_effectiveness_estimate',
 'goverment_effectiveness_std',
 'human_capital_index',
 'doing_business',
 'time_to_get_operation_license',
 'statistical_performance_indicators',
 'individuals_using_internet%',
 'logistic_performance_index',
 'military_expenditure%',
 'GDP_current_US',
 'political_stability_estimate',
 'political_stability_std',
 'rule_of_law_estimate',
 'rule_of_law_std',
 'regulatory_quality_estimate',
 'regulatory_quality_std',
 'government_expenditure_on_education%',
 'government_health_expenditure%',
 'multidimensional_poverty_headcount_ratio%',
 'gini_index',
 'birth_rate',
 'death_rate',
 'life_expectancy_at_birth',
 'population',
 'rural_population',
 'voice_and_accountability_estimate',
 'voice_and_accountability_std',
 'intentional_homicides']
In [214]:
print("The Columns which have non-numerical values are")
non_numerical_columns.columns.tolist()
The Columns which have non-numerical values are
Out[214]:
['country', 'date']

Calculating basic summary statistis (mean, median, standard deviation, etc.) for each numerical variable¶

In [215]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)
summary_stats = pd.DataFrame({
    'Mean': numerical_columns.mean(),
    'Median': numerical_columns.median(),
    'Std Dev': numerical_columns.std(),
    'Min': numerical_columns.min(),
    'Max': numerical_columns.max()
})

summary_stats
Out[215]:
Mean Median Std Dev Min Max
agricultural_land% 37.53 37.69 20.54 0.26 93.44
forest_land% 32.43 30.77 23.38 0.00 98.57
land_area 5250932.58 199810.00 15098866.06 2.03 129987020.90
avg_precipitation 1214.02 1110.00 810.57 18.10 3240.00
trade_in_services% 20.82 13.32 23.10 0.62 316.32
control_of_corruption_estimate -0.03 -0.26 1.00 -1.92 2.46
control_of_corruption_std 0.21 0.17 0.10 0.11 0.99
access_to_electricity% 80.76 98.29 28.75 0.53 100.00
renewvable_energy_consumption% 31.03 20.99 29.86 0.00 98.34
electric_power_consumption 2885.32 1331.11 4103.21 5.55 54799.17
CO2_emisions 1023985.81 23834.75 3343747.34 0.00 35560555.79
other_greenhouse_emisions 1479214.81 50372.71 4595983.69 7.62 48089616.91
population_density 277.14 51.72 1447.28 0.10 21594.80
inflation_annual% 20.03 4.91 291.46 -17.64 23773.13
real_interest_rate 5.43 5.48 15.60 -97.69 628.32
risk_premium_on_lending 5.92 4.61 7.14 -31.50 67.84
research_and_development_expenditure% 1.04 0.73 0.92 0.01 5.44
central_goverment_debt% 64.28 51.28 83.58 -1.17 2002.51
tax_revenue% 17.27 15.62 12.59 0.00 177.28
expense% 28.36 25.65 25.23 0.00 378.49
goverment_effectiveness_estimate -0.03 -0.18 0.99 -2.45 2.43
goverment_effectiveness_std 0.24 0.22 0.08 0.16 1.06
human_capital_index 0.57 0.57 0.15 0.29 0.89
doing_business 95.93 96.00 54.82 1.00 190.00
time_to_get_operation_license 31.25 22.30 29.16 1.20 176.10
statistical_performance_indicators 61.11 59.58 17.67 11.77 90.29
individuals_using_internet% 23.97 8.00 29.39 0.00 100.00
logistic_performance_index 2.84 2.69 0.55 1.21 4.30
military_expenditure% 2.80 2.08 2.89 0.00 117.35
GDP_current_US 1206984758444.87 18427777778.00 5412748059129.95 8824746.24 100562000000000.00
political_stability_estimate -0.02 0.07 1.00 -3.31 1.97
political_stability_std 0.28 0.25 0.08 0.19 0.66
rule_of_law_estimate -0.03 -0.17 1.00 -2.59 2.12
rule_of_law_std 0.21 0.17 0.11 0.12 0.92
regulatory_quality_estimate -0.03 -0.15 0.99 -2.55 2.26
regulatory_quality_std 0.24 0.21 0.08 0.15 1.08
government_expenditure_on_education% 4.33 4.13 1.93 0.00 44.33
government_health_expenditure% 3.23 2.64 2.27 0.06 22.25
multidimensional_poverty_headcount_ratio% 26.94 24.60 11.23 2.37 74.20
gini_index 37.97 35.80 8.96 20.70 65.80
birth_rate 28.20 27.07 12.86 5.00 58.12
death_rate 10.49 9.20 5.36 0.80 103.53
life_expectancy_at_birth 64.25 66.78 11.11 11.99 85.50
population 215965715.82 6787419.00 710295606.01 2646.00 7951149546.00
rural_population 123097056.33 3148533.00 408755719.95 0.00 3435440919.00
voice_and_accountability_estimate -0.02 0.02 1.00 -2.31 1.80
voice_and_accountability_std 0.17 0.14 0.07 0.10 0.73
intentional_homicides 8.04 3.43 12.03 0.00 138.77

Frequency of unique values in the country's column¶

In [216]:
for column in ['country']:
    frequencies = dataset[column].value_counts()
    print(f"Frequencies for {column}:\n{frequencies}\n")
Frequencies for country:
Afghanistan       63
Norway            63
Mozambique        63
Myanmar           63
Namibia           63
                  ..
Guatemala         63
Guinea            63
Zimbabwe          63
Turkey            11
Czech Republic    11
Name: country, Length: 268, dtype: int64

Task 2.¶

Data Visualization:¶

Create histograms or density plots to visualize the distribution of numerical variables.¶

Generate bar plots or pie charts to visualize the distribution of categorical variables.¶

Create box plots to identify outliers and understand the spread of data.¶

Construct scatter plots to explore relationships between pairs¶

In [217]:
# creating scatter plots of features related to consumptions
different_standards_features = [feature for feature in numerical_columns if 'std' in feature]
In [218]:
# Listing down unique features across various standards in the dataset
for feature in different_standards_features:
    print(feature, numerical_columns[feature].unique())
control_of_corruption_std [       nan 0.34050697 0.324013   ... 0.15330967 0.15605473 0.13374464]
goverment_effectiveness_std [       nan 0.18761755 0.30231553 ... 0.24523363 0.20191254 0.18246593]
political_stability_std [       nan 0.4748072  0.4352209  0.45390606 0.43629751 0.34964156
 0.30459777 0.30333257 0.28942022 0.30810529 0.30981806 0.29300761
 0.28206983 0.27388433 0.24388154 0.24574125 0.20889461 0.21560416
 0.219708   0.22803602 0.22710179 0.24134798 0.24531512 0.24781153
 0.42171598 0.38143334 0.39620549 0.39288157 0.33050954 0.3160013
 0.30034962 0.273783   0.26269111 0.26359117 0.2647393  0.26047391
 0.25086629 0.22198433 0.22801979 0.19638619 0.20298342 0.20779568
 0.21594997 0.21523696 0.23301464 0.23612024 0.23941578 0.37391231
 0.34566841 0.34250641 0.31601527 0.29205847 0.28685582 0.27811614
 0.25285429 0.24429454 0.24604777 0.24605513 0.2435471  0.24424958
 0.22680736 0.23150824 0.20011306 0.20681481 0.21260111 0.22127773
 0.22028312 0.23851493 0.24132012 0.42519823 0.44368258 0.42395952
 0.38278705 0.38323367 0.37083346 0.39340416 0.41133508 0.34492502
 0.39368939 0.29147407 0.31012049 0.32092151 0.35220391 0.30930543
 0.31755972 0.34252042 0.32860956 0.61303073 0.57573354 0.5952037
 0.51814151 0.36473665 0.38434094 0.3764399  0.34842056 0.35001713
 0.34126225 0.35239929 0.36077529 0.3173849  0.34742737 0.28007093
 0.29093313 0.3153305  0.29657856 0.28803214 0.2856389  0.28867644
 0.28687262 0.26761767 0.2710602  0.25199843 0.23303711 0.21053849
 0.21616668 0.21814875 0.21937421 0.21829574 0.22347608 0.21501607
 0.33064833 0.33286983 0.33175615 0.32360187 0.32008725 0.3066951
 0.28274652 0.29725292 0.23805772 0.24594994 0.24802686 0.2581048
 0.25412327 0.27849552 0.27712518 0.280121   0.31338027 0.28604171
 0.28284657 0.2789576  0.2701239  0.24576741 0.2392498  0.24130794
 0.24153912 0.23933095 0.23852235 0.21966553 0.22578736 0.19224741
 0.19816442 0.2049565  0.21172263 0.21109025 0.22715022 0.22972222
 0.23361281 0.25491926 0.24469157 0.36922431 0.36291722 0.33110604
 0.33214673 0.31109184 0.30182099 0.29413733 0.26799193 0.27936524
 0.23565063 0.24116345 0.24432918 0.25488386 0.2518363  0.27138987
 0.27081415 0.27579004 0.2648508  0.25644869 0.25778809 0.20627439
 0.20702998 0.20948239 0.21417129 0.21078306 0.4094311  0.30158466
 0.29303238 0.31146678 0.31688541 0.29089096 0.29207346 0.27218479
 0.26757622 0.26071763 0.24220166 0.24786642 0.21057026 0.21140938
 0.21511243 0.22351009 0.22344395 0.23854543 0.2246238  0.22422254
 0.23850638 0.30092695 0.24067757 0.32248533 0.30065388 0.29880321
 0.28377882 0.27378613 0.26718533 0.24639824 0.25246668 0.21239938
 0.22120954 0.22024554 0.23307268 0.23217934 0.24750973 0.27938786
 0.22781482 0.23446649 0.20155461 0.20650044 0.21112111 0.21905675
 0.21890731 0.28083846 0.28506348 0.25444612 0.26130271 0.21412544
 0.22421709 0.31940094 0.26021859 0.25422666 0.25637811 0.26111519
 0.25432348 0.2549746  0.23676789 0.24234171 0.21339691 0.21596204
 0.22137748 0.23054464 0.22863907 0.24747567 0.2511619  0.41270143
 0.37369168 0.34848073 0.34987396 0.33598191 0.301696   0.32470283
 0.25993899 0.274313   0.27612472 0.28873286 0.2805832  0.31392911
 0.31560937 0.32261878 0.22159612 0.22599231 0.23575321 0.23855036
 0.25413197 0.25367594 0.38418293 0.37775585 0.33732522 0.31590858
 0.29767811 0.29303577 0.28320315 0.27516082 0.27641359 0.23129664
 0.23834883 0.21161032 0.21596734 0.22453751 0.22301143 0.30827102
 0.21159795 0.22021282 0.21954472 0.34535113 0.35239625 0.35511994
 0.32408279 0.28279042 0.23522983 0.24028644 0.21574846 0.20729105
 0.61605984 0.56331897 0.5865168  0.31767115 0.31035107 0.31543368
 0.30069813 0.22750434 0.23382279 0.25091901 0.20315695 0.29692462
 0.31975964 0.25838256 0.26442173 0.27373833 0.29541179 0.28139332
 0.26825079 0.26092178 0.26077685 0.25012425 0.2202124  0.22541168
 0.23433635 0.23305322 0.33503246 0.32185015 0.25285798 0.24964677
 0.2598241  0.25527036 0.278447   0.24932286 0.2493356  0.23826431
 0.20558971 0.21667859 0.21929044 0.22001571 0.22282338 0.25724959
 0.24824175 0.25913855 0.25533524 0.24842845 0.25208917 0.21673635
 0.2158439  0.23223656 0.23450227 0.24199788 0.2189825  0.22619511
 0.23028909 0.23981237 0.33061889 0.34074208 0.25632203 0.25662312
 0.25485447 0.25432298 0.25727281 0.24857452 0.31808802 0.33536226
 0.33652997 0.29732421 0.2863625  0.28704148 0.28352326 0.20469727
 0.21174674 0.22587076 0.22490767 0.24445687 0.57297325 0.51557553
 0.38941681 0.39309409 0.36971691 0.37238046 0.3587923  0.35336846
 0.34713376 0.37334967 0.35292655 0.33482686 0.33296463 0.27125251
 0.27162254 0.27852079 0.26927027 0.32501134 0.29617736 0.29935875
 0.33216527 0.2662203  0.27790329 0.22736059 0.23850845 0.24296547
 0.25318784 0.25010827 0.27416778 0.27140418 0.2756381  0.20764013
 0.36495757 0.43122065 0.40466738 0.35024962 0.30580261 0.31325248
 0.30338103 0.2884939  0.25766206 0.2495524  0.25141993 0.25938681
 0.26575315 0.2789005  0.24528623 0.26037687 0.21727818 0.22237226
 0.23856381 0.24709143 0.23905627 0.21883735 0.25635511 0.2533136
 0.35292608 0.33928439 0.30628979 0.29855192 0.29035416 0.25236255
 0.22433582 0.22917284 0.19573942 0.20172459 0.20956327 0.31597561
 0.28244427 0.24325204 0.55676848 0.26156551 0.29082862 0.30512604
 0.29294446 0.31788752 0.30454284 0.28560653 0.29156631 0.26292467
 0.22045697 0.22943297 0.22910525 0.66076291 0.66481692 0.37632996
 0.34571326 0.35369718 0.33061823 0.26252958 0.2402494  0.2479616
 0.20795862 0.2169023  0.22246191 0.22273397 0.22512799 0.22663754
 0.2210357  0.30950347 0.30273956 0.30383134 0.25638649 0.21410161
 0.21498749 0.21831912 0.21932752 0.31034491 0.27880758 0.29344639
 0.23686045 0.24504805 0.25440973 0.27262527 0.26320952 0.36136803
 0.232476   0.2376782  0.20002316 0.20629324 0.32668373 0.3284497
 0.34079763 0.31718811 0.32848206 0.34067583 0.30829489 0.22986214
 0.23639268 0.20071347 0.26734936 0.26254615 0.27963638 0.26161137
 0.26144174 0.58625197 0.61932492 0.6418249  0.65203184 0.55681706
 0.48492041 0.60647321 0.63564318 0.59897769 0.51164603 0.54859662
 0.57059056 0.58095372 0.5223909  0.3992306  0.40737543 0.28799811
 0.28969649 0.27392066 0.27393797 0.49002594 0.4638792  0.44450817
 0.31915247 0.30500919 0.321823   0.32633609 0.28017587 0.27549517
 0.27779677 0.28359938 0.30609456 0.46935394 0.45348999 0.47369942
 0.46379399 0.46901482 0.42006519 0.44471785 0.41451019 0.38210314
 0.40114301 0.39291236 0.38078505 0.41120696 0.39208743 0.36414743
 0.35640842 0.30877671 0.2845954  0.43947971 0.3112961  0.24553926
 0.2150799  0.21568069 0.36453584 0.21806972 0.21823958 0.22046137
 0.28561649 0.2664763  0.26872444 0.60024041 0.5905835  0.5613001
 0.54708678 0.22394055 0.2188132  0.32190937 0.31868052 0.31403664
 0.29173616 0.28588563 0.25863707 0.26864409 0.24694167 0.21090248
 0.21150878 0.21345171 0.21802896 0.2143566  0.64384705 0.39456308
 0.37375858 0.23093073 0.24458161 0.21464805 0.21454914 0.34470671
 0.35602745 0.26501671 0.25868416 0.26323473 0.2379147  0.28562278
 0.21017772 0.21093757 0.21405537 0.30592406]
rule_of_law_std [       nan 0.3505094  0.32727668 ... 0.147108   0.1469944  0.14231273]
regulatory_quality_std [       nan 0.38636038 0.44084185 ... 0.23550989 0.17099454 0.17649963]
voice_and_accountability_std [       nan 0.26145712 0.25608963 ... 0.12136008 0.11163896 0.11021758]
In [219]:
CountryWiseData.columns
Out[219]:
Index(['agricultural_land%', 'forest_land%', 'land_area', 'avg_precipitation',
       'trade_in_services%', 'control_of_corruption_estimate',
       'control_of_corruption_std', 'access_to_electricity%',
       'renewvable_energy_consumption%', 'electric_power_consumption',
       'CO2_emisions', 'other_greenhouse_emisions', 'population_density',
       'inflation_annual%', 'real_interest_rate', 'risk_premium_on_lending',
       'research_and_development_expenditure%', 'central_goverment_debt%',
       'tax_revenue%', 'expense%', 'goverment_effectiveness_estimate',
       'goverment_effectiveness_std', 'human_capital_index', 'doing_business',
       'time_to_get_operation_license', 'statistical_performance_indicators',
       'individuals_using_internet%', 'logistic_performance_index',
       'military_expenditure%', 'GDP_current_US',
       'political_stability_estimate', 'political_stability_std',
       'rule_of_law_estimate', 'rule_of_law_std',
       'regulatory_quality_estimate', 'regulatory_quality_std',
       'government_expenditure_on_education%',
       'government_health_expenditure%',
       'multidimensional_poverty_headcount_ratio%', 'gini_index', 'birth_rate',
       'death_rate', 'life_expectancy_at_birth', 'population',
       'rural_population', 'voice_and_accountability_estimate',
       'voice_and_accountability_std', 'intentional_homicides'],
      dtype='object')
In [220]:
CountryWiseData.head(10)
Out[220]:
agricultural_land% forest_land% land_area avg_precipitation trade_in_services% control_of_corruption_estimate control_of_corruption_std access_to_electricity% renewvable_energy_consumption% electric_power_consumption CO2_emisions other_greenhouse_emisions population_density inflation_annual% real_interest_rate risk_premium_on_lending research_and_development_expenditure% central_goverment_debt% tax_revenue% expense% goverment_effectiveness_estimate goverment_effectiveness_std human_capital_index doing_business time_to_get_operation_license statistical_performance_indicators individuals_using_internet% logistic_performance_index military_expenditure% GDP_current_US political_stability_estimate political_stability_std rule_of_law_estimate rule_of_law_std regulatory_quality_estimate regulatory_quality_std government_expenditure_on_education% government_health_expenditure% multidimensional_poverty_headcount_ratio% gini_index birth_rate death_rate life_expectancy_at_birth population rural_population voice_and_accountability_estimate voice_and_accountability_std intentional_homicides
country
Afghanistan 58.18 1.85 652230.00 327.00 12.65 -1.43 0.22 55.82 27.04 NaN 4821.06 20799.12 27.34 6.12 10.00 NaN NaN NaN 7.80 41.62 -1.50 0.26 0.39 173.00 13.75 44.85 4.76 1.97 1.65 7717726754.86 -2.48 0.30 -1.72 0.21 -1.50 0.23 2.69 0.54 50.55 NaN 47.67 18.23 47.92 18410104.44 14513617.44 -1.30 0.16 6.02
Africa Eastern and Southern 43.69 33.45 14632485.85 NaN 9.83 NaN NaN 30.99 63.33 716.08 452640.56 1116224.04 23.42 10.38 NaN NaN 0.62 NaN 17.78 24.07 NaN NaN NaN NaN 13.24 NaN 6.43 2.48 2.52 366730373780.84 NaN NaN NaN NaN NaN NaN 4.39 2.55 NaN NaN 42.79 14.90 51.89 351919799.95 251054684.00 NaN NaN 11.97
Africa Western and Central 35.83 21.19 9045959.88 NaN 9.66 NaN NaN 41.27 81.26 127.28 153913.71 643892.33 25.87 4.64 NaN NaN 0.15 NaN NaN NaN NaN NaN NaN NaN 32.29 NaN 9.42 2.41 1.79 258364273935.03 NaN NaN NaN NaN NaN NaN 2.82 0.80 NaN NaN 44.23 17.59 48.33 239756352.51 153870196.38 NaN NaN 9.83
Albania 42.15 28.50 27400.00 1485.00 23.31 -0.68 0.17 99.80 40.24 1167.18 3881.95 8614.21 99.59 15.06 2.44 6.20 0.12 66.81 16.43 24.20 -0.31 0.23 0.61 82.00 14.77 71.14 25.38 2.48 2.79 7480345027.69 -0.09 0.27 -0.55 0.17 0.02 0.21 3.37 2.62 47.60 31.01 22.85 7.73 71.50 2713063.16 1574154.24 0.03 0.15 6.40
Algeria 17.45 0.74 2381740.10 89.00 6.83 -0.64 0.18 99.06 0.32 599.61 105761.24 193574.39 10.72 8.60 1.90 5.82 0.26 NaN NaN NaN -0.56 0.21 0.53 157.00 19.30 48.79 16.35 2.46 3.15 71794434279.38 -1.23 0.26 -0.84 0.17 -0.96 0.23 6.45 3.52 NaN 34.37 33.52 10.73 61.42 25915130.46 10791909.30 -0.98 0.14 1.17
American Samoa 19.98 87.94 200.00 NaN NaN 0.88 0.41 NaN 0.11 NaN NaN NaN 213.19 NaN NaN NaN 0.32 NaN NaN NaN 0.46 0.44 NaN NaN NaN NaN 0.00 NaN NaN 601100000.00 1.01 0.36 1.18 0.44 0.26 0.43 14.72 NaN NaN NaN 17.42 4.72 NaN 42344.19 7513.62 0.95 0.49 7.18
Andorra 48.33 34.04 470.00 NaN 83.14 1.27 0.42 100.00 16.76 NaN 484.89 565.57 105.76 NaN NaN NaN NaN NaN NaN NaN 1.59 0.41 NaN NaN NaN NaN 44.98 NaN NaN 1659435438.30 1.36 0.37 1.34 0.40 1.30 0.42 2.75 4.49 NaN NaN 10.77 3.53 NaN 50010.68 5107.17 1.32 0.25 0.60
Angola 38.47 59.54 1246700.00 1010.00 22.19 -1.26 0.18 35.73 62.64 99.22 18463.87 66559.93 11.50 358.38 -9.02 11.00 0.03 NaN 16.95 21.98 -1.09 0.21 0.36 177.00 29.40 50.15 8.03 2.28 6.06 39726806446.37 -0.77 0.26 -1.27 0.17 -1.07 0.22 3.12 1.52 54.00 48.67 48.39 18.85 46.68 14847769.44 7520616.13 -1.15 0.14 4.44
Antigua and Barbuda 20.04 20.72 440.00 1030.00 73.38 0.83 0.37 98.15 0.14 NaN 372.44 774.68 162.56 2.00 7.65 4.74 NaN NaN NaN NaN 0.34 0.40 0.59 113.00 2.90 NaN 32.55 2.90 NaN 809735374.75 0.87 0.34 0.74 0.34 0.56 0.38 2.84 2.89 NaN NaN 20.73 6.62 73.19 71967.87 49358.71 0.55 0.24 9.30
Arab World 35.02 3.36 13534656.22 NaN 16.18 NaN NaN 85.23 5.85 1201.84 1246017.27 1957551.81 17.92 5.69 NaN NaN 0.64 NaN 5.33 23.23 NaN NaN NaN NaN 15.28 NaN 19.81 2.69 7.02 1108382062269.78 NaN NaN NaN NaN NaN NaN 3.89 2.47 NaN NaN 35.87 10.31 61.15 246121661.13 118509632.79 NaN NaN 4.23

Histograms¶

In [221]:
# Histograms for various numerical parameters
data=CountryWiseData.head(90)
for feature in numerical_columns.columns:
#     sns.histplot(df[column_name], kde=True, color='blue')

#     # Add labels and title
#     plt.xlabel(column_name)
#     plt.ylabel('Frequency')
#     plt.title(f'Distribution of {column_name}')
    plt.figure(figsize=(3,3))
    sns.histplot(CountryWiseData[feature],kde=True, color='blue')
    plt.xlabel(feature)
    plt.ylabel('Count')
#     plt.yticks(data.index.tolist())
    plt.title(f"Graph for {feature}")
    plt.show()

Conclusion¶

Majority of the indicators show a right skewed distribution which shows the disparity of resource distribution between rich countries and poor countries.

Scatter Plots¶

In [222]:
for feature in numerical_columns.columns:
    plt.figure(figsize=(10, 20))
    data=CountryWiseData.copy()
    plt.scatter(data[feature],data.index.tolist())
    plt.xlabel(feature)
    plt.ylabel('Countries')
    plt.title(f"Graph for {feature}")
#     plt.show()
C:\Users\Bilal\AppData\Local\Temp\ipykernel_18668\588317347.py:2: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  plt.figure(figsize=(10, 20))

Conclusion:¶

Not many conclusions can be drawn from this data since this type of represendtion is not Suitable for the given data.

Bar Graph¶

In [223]:
data=CountryWiseData.head(90)
for feature in numerical_columns.columns:
    plt.figure(figsize=(10, 20))
    plt.barh(data.index.tolist(),data[feature],height=0.2)
    plt.xlabel(feature)
    plt.ylabel('Countries')
    plt.yticks(data.index.tolist())
    plt.title(f"Graph for {feature}")
    plt.show()
    

Box Plots¶

In [224]:
for feature in  numerical_columns.columns:
    data=CountryWiseData.copy()
    if 0 in data[feature].unique():
        pass
    else:
        plt.figure(figsize=(4, 3))
        data[feature]=np.log(data[feature])
        data.boxplot(column=feature)
        plt.ylabel(feature)
        plt.title(feature)
        plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)

Conclusion¶

Using Box plots we can see that some data points are evently distributed while some data points have a lot of outliers. As a general trend, parameters which show a good quality of life have quite a few outliers showing disparity between first world countries and the rest of the world

Scatter plots to explore relationships between pairs of variables.¶

In [225]:
## Plotting data to understand the relationship of Agricultural Land and forest land 
CountryWiseData.head(5)
Out[225]:
agricultural_land% forest_land% land_area avg_precipitation trade_in_services% control_of_corruption_estimate control_of_corruption_std access_to_electricity% renewvable_energy_consumption% electric_power_consumption CO2_emisions other_greenhouse_emisions population_density inflation_annual% real_interest_rate risk_premium_on_lending research_and_development_expenditure% central_goverment_debt% tax_revenue% expense% goverment_effectiveness_estimate goverment_effectiveness_std human_capital_index doing_business time_to_get_operation_license statistical_performance_indicators individuals_using_internet% logistic_performance_index military_expenditure% GDP_current_US political_stability_estimate political_stability_std rule_of_law_estimate rule_of_law_std regulatory_quality_estimate regulatory_quality_std government_expenditure_on_education% government_health_expenditure% multidimensional_poverty_headcount_ratio% gini_index birth_rate death_rate life_expectancy_at_birth population rural_population voice_and_accountability_estimate voice_and_accountability_std intentional_homicides
country
Afghanistan 58.18 1.85 652230.00 327.00 12.65 -1.43 0.22 55.82 27.04 NaN 4821.06 20799.12 27.34 6.12 10.00 NaN NaN NaN 7.80 41.62 -1.50 0.26 0.39 173.00 13.75 44.85 4.76 1.97 1.65 7717726754.86 -2.48 0.30 -1.72 0.21 -1.50 0.23 2.69 0.54 50.55 NaN 47.67 18.23 47.92 18410104.44 14513617.44 -1.30 0.16 6.02
Africa Eastern and Southern 43.69 33.45 14632485.85 NaN 9.83 NaN NaN 30.99 63.33 716.08 452640.56 1116224.04 23.42 10.38 NaN NaN 0.62 NaN 17.78 24.07 NaN NaN NaN NaN 13.24 NaN 6.43 2.48 2.52 366730373780.84 NaN NaN NaN NaN NaN NaN 4.39 2.55 NaN NaN 42.79 14.90 51.89 351919799.95 251054684.00 NaN NaN 11.97
Africa Western and Central 35.83 21.19 9045959.88 NaN 9.66 NaN NaN 41.27 81.26 127.28 153913.71 643892.33 25.87 4.64 NaN NaN 0.15 NaN NaN NaN NaN NaN NaN NaN 32.29 NaN 9.42 2.41 1.79 258364273935.03 NaN NaN NaN NaN NaN NaN 2.82 0.80 NaN NaN 44.23 17.59 48.33 239756352.51 153870196.38 NaN NaN 9.83
Albania 42.15 28.50 27400.00 1485.00 23.31 -0.68 0.17 99.80 40.24 1167.18 3881.95 8614.21 99.59 15.06 2.44 6.20 0.12 66.81 16.43 24.20 -0.31 0.23 0.61 82.00 14.77 71.14 25.38 2.48 2.79 7480345027.69 -0.09 0.27 -0.55 0.17 0.02 0.21 3.37 2.62 47.60 31.01 22.85 7.73 71.50 2713063.16 1574154.24 0.03 0.15 6.40
Algeria 17.45 0.74 2381740.10 89.00 6.83 -0.64 0.18 99.06 0.32 599.61 105761.24 193574.39 10.72 8.60 1.90 5.82 0.26 NaN NaN NaN -0.56 0.21 0.53 157.00 19.30 48.79 16.35 2.46 3.15 71794434279.38 -1.23 0.26 -0.84 0.17 -0.96 0.23 6.45 3.52 NaN 34.37 33.52 10.73 61.42 25915130.46 10791909.30 -0.98 0.14 1.17

Relationship between Agri and Forest Land¶

In [226]:
plt.figure(figsize=(4, 3))
x=CountryWiseData['population']
y=CountryWiseData['rural_population']
plt.scatter(x,y,s=8)
plt.ylabel('Population')
plt.xlabel('Rural Population')
plt.title('Relationship between Total Population and Rural Population')

plt.show()

Relationship between countries Tax Revenues and Expenses¶

In [227]:
plt.figure(figsize=(4, 3))
x=CountryWiseData['individuals_using_internet%']
y=CountryWiseData['human_capital_index']
plt.scatter(x,y,s=8)
plt.xlabel('Percentage of People accessing internet')
plt.ylabel('Human Capital Index')
plt.title('Relationship between Human Capital Index and Internet Accessiblity')

plt.show()

Relationship between Control of corruption and Accountablity standards¶

In [228]:
plt.figure(figsize=(4, 3))
x=CountryWiseData['control_of_corruption_std']
y=CountryWiseData['voice_and_accountability_std']
plt.scatter(x,y,s=8)
plt.xlabel('Control of Corruption')
plt.ylabel('Voice and Accountablity')
plt.title('Relationship between Corruption Control and Accountablity')

plt.show()

Conclusion¶

In this part I have tried to explore the relation between the Human Capital Index (HCI) with different parameters available in the dataset, a general trend shown with parameters which correlate to a higher quality of life such as access to electricity and internet, longers life expectency have a directly propotional relation with HCI. While other parameters such as Percentage of Forest Land, and Land Area have low corrleation with HCI

In [ ]:
 

Task 3.¶

Handling Missing Data:¶

Explore the patterns of missing data across variables.¶

Decide on an appropriate strategy for handling missing values (imputation, removal, etc.).¶

Exploring Missing Data¶

In [229]:
df = CountryWiseData.copy()
for column_name in CountryWiseData.columns:
    df[column_name].replace('None', pd.NA, inplace=True)

    # Convert the column to a numeric type (if it's not already)
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')

    # Calculate the median
    median_value = df[column_name].median()

    # Impute NaN values with the median
    df[column_name].fillna(median_value, inplace=True)
In [230]:
dataset=CountryWiseData.copy()
features_with_na=[features for features in dataset.columns if dataset[features].isnull().sum()>1]

for feature in features_with_na:
    print(feature, np.round(dataset[feature].isnull().mean(), 2),  '\b% missing values')

print()
print()
print()

    
features_with_nan = [feature for feature in dataset.columns if dataset[feature].isna().sum() > 1]

for feature in features_with_nan:
    print(f"{feature}: {np.round(dataset[feature].isna().mean() * 100, 2)}% NaN values")
agricultural_land% 0.04 % missing values
forest_land% 0.02 % missing values
land_area 0.02 % missing values
avg_precipitation 0.32 % missing values
trade_in_services% 0.08 % missing values
control_of_corruption_estimate 0.24 % missing values
control_of_corruption_std 0.24 % missing values
access_to_electricity% 0.01 % missing values
renewvable_energy_consumption% 0.03 % missing values
electric_power_consumption 0.31 % missing values
CO2_emisions 0.11 % missing values
other_greenhouse_emisions 0.11 % missing values
population_density 0.02 % missing values
inflation_annual% 0.1 % missing values
real_interest_rate 0.45 % missing values
risk_premium_on_lending 0.68 % missing values
research_and_development_expenditure% 0.3 % missing values
central_goverment_debt% 0.5 % missing values
tax_revenue% 0.27 % missing values
expense% 0.31 % missing values
goverment_effectiveness_estimate 0.24 % missing values
goverment_effectiveness_std 0.24 % missing values
human_capital_index 0.35 % missing values
doing_business 0.29 % missing values
time_to_get_operation_license 0.26 % missing values
statistical_performance_indicators 0.35 % missing values
individuals_using_internet% 0.03 % missing values
logistic_performance_index 0.19 % missing values
military_expenditure% 0.21 % missing values
GDP_current_US 0.02 % missing values
political_stability_estimate 0.24 % missing values
political_stability_std 0.24 % missing values
rule_of_law_estimate 0.24 % missing values
rule_of_law_std 0.24 % missing values
regulatory_quality_estimate 0.24 % missing values
regulatory_quality_std 0.24 % missing values
government_expenditure_on_education% 0.07 % missing values
government_health_expenditure% 0.11 % missing values
multidimensional_poverty_headcount_ratio% 0.75 % missing values
gini_index 0.38 % missing values
birth_rate 0.01 % missing values
death_rate 0.01 % missing values
life_expectancy_at_birth 0.03 % missing values
population 0.01 % missing values
rural_population 0.02 % missing values
voice_and_accountability_estimate 0.24 % missing values
voice_and_accountability_std 0.24 % missing values
intentional_homicides 0.09 % missing values



agricultural_land%: 4.1% NaN values
forest_land%: 2.24% NaN values
land_area: 1.87% NaN values
avg_precipitation: 32.09% NaN values
trade_in_services%: 7.84% NaN values
control_of_corruption_estimate: 24.25% NaN values
control_of_corruption_std: 24.25% NaN values
access_to_electricity%: 1.49% NaN values
renewvable_energy_consumption%: 2.99% NaN values
electric_power_consumption: 30.6% NaN values
CO2_emisions: 10.82% NaN values
other_greenhouse_emisions: 10.82% NaN values
population_density: 1.87% NaN values
inflation_annual%: 10.45% NaN values
real_interest_rate: 44.78% NaN values
risk_premium_on_lending: 67.54% NaN values
research_and_development_expenditure%: 29.85% NaN values
central_goverment_debt%: 50.37% NaN values
tax_revenue%: 26.87% NaN values
expense%: 31.34% NaN values
goverment_effectiveness_estimate: 24.25% NaN values
goverment_effectiveness_std: 24.25% NaN values
human_capital_index: 35.07% NaN values
doing_business: 29.48% NaN values
time_to_get_operation_license: 25.75% NaN values
statistical_performance_indicators: 35.07% NaN values
individuals_using_internet%: 2.61% NaN values
logistic_performance_index: 19.03% NaN values
military_expenditure%: 20.9% NaN values
GDP_current_US: 2.24% NaN values
political_stability_estimate: 23.51% NaN values
political_stability_std: 23.51% NaN values
rule_of_law_estimate: 23.51% NaN values
rule_of_law_std: 23.51% NaN values
regulatory_quality_estimate: 24.25% NaN values
regulatory_quality_std: 24.25% NaN values
government_expenditure_on_education%: 6.72% NaN values
government_health_expenditure%: 11.19% NaN values
multidimensional_poverty_headcount_ratio%: 75.0% NaN values
gini_index: 37.69% NaN values
birth_rate: 1.12% NaN values
death_rate: 1.12% NaN values
life_expectancy_at_birth: 2.99% NaN values
population: 1.12% NaN values
rural_population: 1.87% NaN values
voice_and_accountability_estimate: 23.51% NaN values
voice_and_accountability_std: 23.51% NaN values
intentional_homicides: 8.96% NaN values

Strategy for handling missing values¶

As evident from the histograms drawn in Question 01, majority of the data points are right skewed i.e. majority of the data points are concnetrated towards the left.

In such a situtation the median imputation for filling in the missing values is a viable technique which will also preserve the skewed nature of the data.

In [231]:
df=CountryWiseData.copy()

numeric_columns = df.select_dtypes(include=['number']).columns

numeric_columns = [col for col in numeric_columns if col != 'country' and col !='date']

for column in numeric_columns:
    median_value = df[column].median()
    df[column].fillna(median_value, inplace=True)
In [232]:
dataset=df
features_with_na=[features for features in dataset.columns if dataset[features].isnull().sum()>1]

for feature in features_with_na:
    print(feature, np.round(dataset[feature].isnull().mean(), 2),  '\b% missing values')

print()
print()
print()

    
features_with_nan = [feature for feature in dataset.columns if dataset[feature].isna().sum() > 1]

for feature in features_with_nan:
    print(f"{feature}: {np.round(dataset[feature].isna().mean() * 100, 2)}% NaN values")


In [233]:
# Putting new data in CountryWiseData DataFrame
CountryWiseDataNew=df.groupby('country').mean()

As we can see after cleaning, there are no NaN or None values in the data.¶

Task 4.¶

Outlier Detection and Treatment:¶

Identify and visualize outliers in numerical variables.¶

Decide whether to remove, transform, or treat outliers based on domain knowledge and analysis goals.¶

Plotting outliers using Boxplots¶

In [234]:
for feature in  df.columns:
    data=df.copy()
    if 0 in data[feature].unique():
        pass
    else:
        plt.figure(figsize=(4, 3))
        data.boxplot(column=feature)
        plt.ylabel(feature)
        plt.title(feature)
        plt.show()

Given the nature of the data we can't drop the outliers thus applying logarithmic transformation on the data¶

In [235]:
for feature in  df.columns:
    data=df.copy()
    if 0 in data[feature].unique():
        pass
    else:
        # Generate some sample data (replace this with your actual data)
        data1 = data[feature]
        data2 = np.log(data[feature])

        # Create a figure and a set of subplots
        fig, axs = plt.subplots(1, 2, figsize=(10, 5))

        # Create the first box plot
        axs[0].boxplot(data1)
        axs[0].set_title(f'{feature} boxplot')

        # Create the second box plot
        axs[1].boxplot(data2)
        axs[1].set_title(f'{feature} boxplot with logarithmic transformation')

        # Show the plots
        plt.tight_layout()
        plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)

Conclusion:¶

As we can see on comparative analysis the parameters which were previously not very understandable become much more correlated once we apply logarithmic transformation. One more key observation is that parameters which were previously understandable do get messed up after logarithmic transformation. Thus log transformation should be applied with caution and with thorough analysis.

Task 5.¶

Data Distribution Analysis:¶

Visualize the data distribution and assess skewness and kurtosis.¶

For data distribution analysis lets draw histograms for each numeric parameters to analyze its distribution¶

In [236]:
# Histograms for various numerical parameters
data=CountryWiseDataNew.copy()
for feature in numerical_columns.columns:
    plt.figure(figsize=(3,3))
    sns.histplot(CountryWiseData[feature],kde=True, color='blue')
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.title(f"Graph for {feature}")
    plt.show()

Given the various histograms we can see that the majority of the parameters are right skewed while some parameters like "Access to Electricity" are left skewed.

The kurtosis of each feature is given below:¶

In [237]:
for feature in numerical_columns.columns:
    print(f"Kurtosis value for {feature} is {np.round(stats.kurtosis(CountryWiseDataNew[feature]),2)}.")
Kurtosis value for agricultural_land% is -0.56.
Kurtosis value for forest_land% is -0.12.
Kurtosis value for land_area is 29.57.
Kurtosis value for avg_precipitation is 0.9.
Kurtosis value for trade_in_services% is 19.85.
Kurtosis value for control_of_corruption_estimate is 0.38.
Kurtosis value for control_of_corruption_std is 32.65.
Kurtosis value for access_to_electricity% is -0.02.
Kurtosis value for renewvable_energy_consumption% is -0.71.
Kurtosis value for electric_power_consumption is 12.14.
Kurtosis value for CO2_emisions is 29.73.
Kurtosis value for other_greenhouse_emisions is 30.24.
Kurtosis value for population_density is 99.17.
Kurtosis value for inflation_annual% is 72.93.
Kurtosis value for real_interest_rate is 13.61.
Kurtosis value for risk_premium_on_lending is 45.4.
Kurtosis value for research_and_development_expenditure% is 4.8.
Kurtosis value for central_goverment_debt% is 78.48.
Kurtosis value for tax_revenue% is 128.69.
Kurtosis value for expense% is 150.04.
Kurtosis value for goverment_effectiveness_estimate is 0.12.
Kurtosis value for goverment_effectiveness_std is 29.4.
Kurtosis value for human_capital_index is 0.1.
Kurtosis value for doing_business is -0.44.
Kurtosis value for time_to_get_operation_license is 9.19.
Kurtosis value for statistical_performance_indicators is 0.59.
Kurtosis value for individuals_using_internet% is -0.12.
Kurtosis value for logistic_performance_index is 0.71.
Kurtosis value for military_expenditure% is 33.28.
Kurtosis value for GDP_current_US is 38.15.
Kurtosis value for political_stability_estimate is 0.34.
Kurtosis value for political_stability_std is 19.69.
Kurtosis value for rule_of_law_estimate is -0.16.
Kurtosis value for rule_of_law_std is 16.66.
Kurtosis value for regulatory_quality_estimate is 0.03.
Kurtosis value for regulatory_quality_std is 28.46.
Kurtosis value for government_expenditure_on_education% is 9.9.
Kurtosis value for government_health_expenditure% is 1.47.
Kurtosis value for multidimensional_poverty_headcount_ratio% is 10.3.
Kurtosis value for gini_index is 1.81.
Kurtosis value for birth_rate is -1.16.
Kurtosis value for death_rate is 0.93.
Kurtosis value for life_expectancy_at_birth is -0.73.
Kurtosis value for population is 27.87.
Kurtosis value for rural_population is 25.94.
Kurtosis value for voice_and_accountability_estimate is -0.44.
Kurtosis value for voice_and_accountability_std is 16.18.
Kurtosis value for intentional_homicides is 11.65.

Kurtosis Analysis:¶

We generally interpret kurtosis on the following basis:

  • If kurtosis_value > 0, the distribution has heavy tails and a high peak (leptokurtic).
  • If kurtosis_value == 0, the distribution is mesokurtic (normal).
  • If kurtosis_value < 0, the distribution has lighter tails and a lower peak (platykurtic).

A visual inspection of histogram of each parameter and is corresponding kurtosis verifies this obersvation that metrics with >0 kurtosis indeed have high peaks, metrics with<0 kurtosis have lower peaks, and metrics with kurtosis value closer to 0 are normally distributed.

Task 6.¶

Bivariate Analysis:¶

Analyze relationships between pairs of variables through scatter plots.¶

Exploring relationship of Other Metrics with Human Capital Index¶

In [238]:
data = CountryWiseData.copy()
for feature in numerical_columns.columns:
    if feature != 'human_capital_index':
        plt.figure(figsize=(4, 3))
        x=CountryWiseData['human_capital_index']
        y=CountryWiseData[feature]
        plt.scatter(x,y,s=8)
        plt.xlabel('human_capital_index')
        plt.ylabel(feature)
        plt.title(f'Relationship between human_capital_index and {feature}')

        plt.show()
    

Task 7.¶

Grouping and Aggregation:¶

Group data by categorical variables and calculate summary statistics within each group.¶

Explore differences or patterns between different groups.¶

In [239]:
# Grouping data on the basis of countires:
GroupedData=first_data.copy()
In [240]:
GroupedData=GroupedData.groupby('country').mean()
GroupedData
Out[240]:
agricultural_land% forest_land% land_area avg_precipitation trade_in_services% control_of_corruption_estimate control_of_corruption_std access_to_electricity% renewvable_energy_consumption% electric_power_consumption CO2_emisions other_greenhouse_emisions population_density inflation_annual% real_interest_rate risk_premium_on_lending research_and_development_expenditure% central_goverment_debt% tax_revenue% expense% goverment_effectiveness_estimate goverment_effectiveness_std human_capital_index doing_business time_to_get_operation_license statistical_performance_indicators individuals_using_internet% logistic_performance_index military_expenditure% GDP_current_US political_stability_estimate political_stability_std rule_of_law_estimate rule_of_law_std regulatory_quality_estimate regulatory_quality_std government_expenditure_on_education% government_health_expenditure% multidimensional_poverty_headcount_ratio% gini_index birth_rate death_rate life_expectancy_at_birth population rural_population voice_and_accountability_estimate voice_and_accountability_std intentional_homicides
country
Afghanistan 58.18 1.85 652230.00 327.00 12.65 -1.43 0.22 55.82 27.04 NaN 4821.06 20799.12 27.34 6.12 10.00 NaN NaN NaN 7.80 41.62 -1.50 0.26 0.39 173.00 13.75 44.85 4.76 1.97 1.65 7717726754.86 -2.48 0.30 -1.72 0.21 -1.50 0.23 2.69 0.54 50.55 NaN 47.67 18.23 47.92 18410104.44 14513617.44 -1.30 0.16 6.02
Africa Eastern and Southern 43.69 33.45 14632485.85 NaN 9.83 NaN NaN 30.99 63.33 716.08 452640.56 1116224.04 23.42 10.38 NaN NaN 0.62 NaN 17.78 24.07 NaN NaN NaN NaN 13.24 NaN 6.43 2.48 2.52 366730373780.84 NaN NaN NaN NaN NaN NaN 4.39 2.55 NaN NaN 42.79 14.90 51.89 351919799.95 251054684.00 NaN NaN 11.97
Africa Western and Central 35.83 21.19 9045959.88 NaN 9.66 NaN NaN 41.27 81.26 127.28 153913.71 643892.33 25.87 4.64 NaN NaN 0.15 NaN NaN NaN NaN NaN NaN NaN 32.29 NaN 9.42 2.41 1.79 258364273935.03 NaN NaN NaN NaN NaN NaN 2.82 0.80 NaN NaN 44.23 17.59 48.33 239756352.51 153870196.38 NaN NaN 9.83
Albania 42.15 28.50 27400.00 1485.00 23.31 -0.68 0.17 99.80 40.24 1167.18 3881.95 8614.21 99.59 15.06 2.44 6.20 0.12 66.81 16.43 24.20 -0.31 0.23 0.61 82.00 14.77 71.14 25.38 2.48 2.79 7480345027.69 -0.09 0.27 -0.55 0.17 0.02 0.21 3.37 2.62 47.60 31.01 22.85 7.73 71.50 2713063.16 1574154.24 0.03 0.15 6.40
Algeria 17.45 0.74 2381740.10 89.00 6.83 -0.64 0.18 99.06 0.32 599.61 105761.24 193574.39 10.72 8.60 1.90 5.82 0.26 NaN NaN NaN -0.56 0.21 0.53 157.00 19.30 48.79 16.35 2.46 3.15 71794434279.38 -1.23 0.26 -0.84 0.17 -0.96 0.23 6.45 3.52 NaN 34.37 33.52 10.73 61.42 25915130.46 10791909.30 -0.98 0.14 1.17
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
West Bank and Gaza 79.33 1.59 6020.00 402.00 16.51 -0.26 0.28 99.54 17.25 NaN NaN NaN 558.20 2.96 2.84 NaN 0.30 NaN 8.36 12.43 -0.82 0.33 0.57 117.00 16.77 67.16 27.35 NaN NaN 9172903448.28 -1.77 0.32 -0.37 0.27 -0.38 0.35 5.11 NaN 24.00 34.52 37.09 4.29 71.79 3458736.48 907983.03 -0.87 0.19 1.35
World 36.36 31.78 129717210.21 NaN 9.53 NaN NaN 83.60 17.34 2104.54 28173346.98 38928765.52 41.34 5.36 NaN NaN 2.06 NaN 14.19 26.53 NaN NaN NaN NaN 31.47 NaN 21.50 2.87 3.47 32366182539682.54 NaN NaN NaN NaN NaN NaN 4.15 5.66 NaN NaN 25.52 9.89 64.60 5407739439.19 2920233053.79 NaN NaN 6.17
Yemen, Rep. 44.52 1.04 527970.00 167.00 10.41 -1.22 0.18 56.89 1.37 114.12 15957.08 33225.43 29.21 17.50 7.13 4.18 NaN NaN NaN NaN -1.33 0.22 0.37 187.00 6.75 39.09 6.75 2.40 5.63 18230152874.03 -2.11 0.26 -1.37 0.17 -1.00 0.21 6.49 1.56 NaN 35.47 46.35 14.83 54.78 15833245.87 11515379.17 -1.29 0.14 4.59
Zambia 28.79 62.69 743390.00 1020.00 11.79 -0.56 0.16 24.77 86.38 843.76 3320.69 28515.98 11.84 36.44 -0.49 3.77 0.05 105.35 15.50 18.25 -0.80 0.18 0.39 85.00 30.63 57.93 4.26 2.41 1.87 8035327954.94 0.21 0.26 -0.43 0.16 -0.55 0.19 3.79 1.83 NaN 52.69 45.67 13.65 52.34 9055472.62 5573550.30 -0.29 0.13 7.39
Zimbabwe 35.50 46.88 386850.00 657.58 9.63 -1.23 0.16 37.57 75.06 842.78 12378.40 29168.66 24.68 79.61 -20.57 NaN NaN NaN 14.83 18.91 -1.17 0.18 0.44 140.00 7.00 53.54 8.33 2.31 3.25 8333485469.06 -0.95 0.26 -1.50 0.15 -1.70 0.18 9.08 1.75 NaN 45.93 39.59 11.92 54.74 9667335.21 6857547.03 -1.30 0.13 8.34

268 rows × 48 columns

The differences between metrics wrt to countries have already been explored and discussed above in previous questions¶

Task 8.¶

Data Transformation:¶

Apply mathematical transformations (e.g., logarithmic or exponential transformations) to normalize data.¶

Convert categorical variables to numerical format using encoding techniques.¶

In [241]:
## Applying Log transformation on data while plotting box plots
In [242]:
for feature in  df.columns:
    data=df.copy()
    if 0 in data[feature].unique():
        pass
    else:
        # Generate some sample data (replace this with your actual data)
        data1 = data[feature]
        data2 = np.log(data[feature])

        # Create a figure and a set of subplots
        fig, axs = plt.subplots(1, 2, figsize=(10, 5))

        # Create the first box plot
        axs[0].boxplot(data1)
        axs[0].set_title(f'{feature} boxplot')

        # Create the second box plot
        axs[1].boxplot(data2)
        axs[1].set_title(f'{feature} boxplot with logarithmic transformation')

        # Show the plots
        plt.tight_layout()
        plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arraylike.py:397: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
In [ ]: